Read data
headings_em <- read_csv("data/raw/headings-emilian.csv")
headings_eo <- read_csv("data/raw/headings-esperanto.csv")
emilian <- read_csv("data/raw/emilian-clean.csv", skip = 1, col_names = headings_em$new, na = c("", "NA", "na")) %>%
mutate(
understand = factor(understand, levels = c("NO", "AL", "50/50", "G", "VG")),
speak = factor(speak, levels = c("NO", "AL", "50/50", "G", "VG"))
)
esperanto <- read_csv("data/raw/esperanto-clean.csv", skip = 1, col_names = headings_eo$new, na = c("", "NA", "na")) %>%
mutate(
age = str_remove(age, "jaroj"),
age = str_replace(age, "naskiĝis en la 1995a", "25"),
age = as.numeric(age),
understand = factor(understand, levels = c("NO", "AL", "50/50", "G", "VG")),
speak = factor(speak, levels = c("NO", "AL", "50/50", "G", "VG"))
)
Emilian
Participants
emilian %>%
ggplot(aes(gender)) +
geom_bar()

emilian %>%
ggplot(aes(age)) +
geom_histogram()

emilian %>%
ggplot(aes(age)) +
geom_density()

emilian %>%
ggplot(aes(education)) +
geom_bar()

emilian %>%
ggplot(aes(age, education)) +
geom_jitter(height = 0.2, alpha = 0.5)

emilian %>%
count(profession) %>%
ggplot(aes(reorder(profession, -n), n)) +
geom_bar(stat = "identity")

emilian %>%
count(languages_family) %>%
ggplot(aes(reorder(languages_family, -n), n)) +
geom_bar(stat = "identity")

emilian %>%
count(languages_parents) %>%
ggplot(aes(reorder(languages_parents, -n), n)) +
geom_bar(stat = "identity")

Language
Understand
emilian %>%
ggplot(aes(understand, fill = understand)) +
geom_bar() +
scale_fill_brewer(type = "div") +
theme_dark()

emilian %>%
ggplot(aes(understand, fill = gender)) +
geom_bar()

emilian %>%
ggplot(aes(understand, fill = gender)) +
geom_bar(position = "fill")

emilian %>%
ggplot(aes(age, fill = understand)) +
geom_histogram(binwidth = 5) +
facet_grid(understand ~ .)

emilian %>%
ggplot(aes(understand, fill = profession)) +
geom_bar()

emilian %>%
ggplot(aes(understand, fill = profession)) +
geom_bar(position = "fill")

Speak
emilian %>%
ggplot(aes(speak, fill = speak)) +
geom_bar() +
scale_fill_brewer(type = "div")

emilian %>%
ggplot(aes(speak, fill = gender)) +
geom_bar()

emilian %>%
ggplot(aes(speak, fill = gender)) +
geom_bar(position = "fill")

esperanto %>%
ggplot(aes(age, fill = speak)) +
geom_histogram(binwidth = 5) +
facet_grid(speak ~ .)

emilian %>%
ggplot(aes(speak, fill = profession)) +
geom_bar()

emilian %>%
ggplot(aes(speak, fill = profession)) +
geom_bar(position = "fill")

Read and write
emilian %>%
ggplot(aes(read_write, fill = read_write)) +
geom_bar()

emilian %>%
ggplot(aes(read_write, fill = gender)) +
geom_bar()

emilian %>%
ggplot(aes(read_write, fill = gender)) +
geom_bar(position = "fill")

emilian %>%
drop_na(read_write) %>%
ggplot(aes(age, fill = read_write)) +
geom_histogram(binwidth = 5) +
facet_grid(read_write ~ .)

emilian %>%
ggplot(aes(read_write, fill = profession)) +
geom_bar()

emilian %>%
ggplot(aes(read_write, fill = profession)) +
geom_bar(position = "fill")

Attitude
emilian %>%
select(educated:familiar) %>%
pivot_longer(educated:familiar, names_to = "feature", values_to = "rating") %>%
ggplot(aes(as.factor(rating), fill = as.factor(rating))) +
geom_bar() +
scale_fill_brewer() +
facet_grid(. ~ feature)

emilian %>%
select(educated:familiar) %>%
pivot_longer(educated:familiar, names_to = "feature", values_to = "rating") %>%
ggplot(aes(feature, fill = as.factor(rating))) +
geom_bar(position = "fill") +
scale_fill_brewer()

Urban vs Rural
emil_rur <- emilian %>%
mutate(
ru_ur = ifelse(
str_detect(birth_place, "-RU"), "rural",
ifelse(
str_detect(birth_place, "-UR"), "urban",
NA
)
)
)
emil_rur_clean <- emil_rur %>%
select(id, understand, speak, read_write, educated:familiar, ru_ur) %>%
mutate(
understand = ordered(understand, levels = c("NO", "AL", "50/50", "G", "VG")),
speak = ordered(speak, levels = c("NO", "AL", "50/50", "G", "VG")),
across(educated:familiar, ~ as.ordered(.x))
) %>%
drop_na()
emil_rur %>%
drop_na(ru_ur) %>%
ggplot(aes(ru_ur, fill = understand)) +
geom_bar()

emil_rur %>%
drop_na(ru_ur) %>%
ggplot(aes(ru_ur, fill = understand)) +
geom_bar(position = "fill")

Modelling
lang_use <- bind_rows(
emilian %>% mutate(language = "emilian"),
esperanto %>% mutate(language = "esperanto")
) %>%
select(id, age, understand, speak, read_write, language, educated:familiar) %>%
mutate(
understand = ordered(understand, levels = c("NO", "AL", "50/50", "G", "VG")),
speak = ordered(speak, levels = c("NO", "AL", "50/50", "G", "VG")),
across(educated:familiar, ~ as.ordered(.x)),
mean_comp_raw = as.numeric(understand) +
as.numeric(speak) +
ifelse(read_write == "Yes", 5, 0),
mean_comp = mean_comp_raw / 15
) %>%
drop_na()
attitudes <- lang_use %>%
select(educated:familiar)
MCA
attitudes_mca <- MCA(attitudes, graph = FALSE)
attitudes_dims <- attitudes_mca[["ind"]][["coord"]]
fviz_screeplot(attitudes_mca, addlabels = TRUE, ylim = c(0, 15))

fviz_mca_biplot(attitudes_mca, repel = TRUE)

fviz_mca_var(attitudes_mca, choice = "mca.cor", repel = TRUE)

fviz_mca_var(attitudes_mca, col.var = "black", shape.var = 15, repel = TRUE)

fviz_mca_var(attitudes_mca, col.var = "cos2", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE)

fviz_cos2(attitudes_mca, choice = "var", axes = 1:2)

fviz_contrib(attitudes_mca, choice = "var", axes = 1, top = 15)

fviz_contrib(attitudes_mca, choice = "var", axes = 2, top = 15)

fviz_contrib(attitudes_mca, choice = "var", axes = 1:2, top = 15)

fviz_mca_ind(attitudes_mca, label = "none", habillage = "trustworthy")

fviz_mca_ind(attitudes_mca, label = "none", habillage = "friendly")

fviz_mca_ind(attitudes_mca, label = "none", habillage = "kind")

fviz_mca_ind(attitudes_mca, label = "none", habillage = "familiar")

Understand
lang_use$dim_1 <- -attitudes_dims[, 1]
get_prior(
understand ~
language * dim_1,
data = lang_use,
family = cumulative()
)
## prior class coef group resp dpar nlpar
## (flat) b
## (flat) b dim_1
## (flat) b languageesperanto
## (flat) b languageesperanto:dim_1
## student_t(3, 0, 2.5) Intercept
## student_t(3, 0, 2.5) Intercept 1
## student_t(3, 0, 2.5) Intercept 2
## student_t(3, 0, 2.5) Intercept 3
## student_t(3, 0, 2.5) Intercept 4
## bound source
## default
## (vectorized)
## (vectorized)
## (vectorized)
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
understand_priors <- c(
prior(normal(0, 3), class = Intercept),
prior(normal(0, 1), class = b)
)
understand_pchek <- brm(
understand ~
language * dim_1,
data = lang_use,
family = cumulative(),
prior = understand_priors,
sample_prior = "only",
backend = "cmdstanr",
cores = 4,
file = "./data/rds/understand_pchek"
)
conditional_effects(understand_pchek, effects = "dim_1", conditions = make_conditions(lang_use, "language"), categorical = TRUE)

understand_bm <- brm(
understand ~
language * dim_1,
data = lang_use,
family = cumulative(),
prior = understand_priors,
backend = "cmdstanr",
cores = 4,
file = "./data/rds/understand_bm"
)
conditional_effects(understand_bm, effects = "dim_1", categorical = TRUE, conditions = make_conditions(lang_use, "language"))

Speak
speak_bm <- brm(
speak ~
language * dim_1,
data = lang_use,
family = cumulative(),
# Same priors as understand_bm
prior = understand_priors,
backend = "cmdstanr",
cores = 4,
file = "./data/rds/speak_bm"
)
conditional_effects(speak_bm, effects = "dim_1", categorical = TRUE, conditions = make_conditions(speak_bm, "language"))

Read and write
rw_bm <- brm(
read_write ~
language * dim_1,
data = lang_use,
family = bernoulli(),
# Same priors as understand_bm
prior = understand_priors,
backend = "cmdstanr",
cores = 4,
file = "./data/rds/rw_bm"
)
conditional_effects(rw_bm, effects = "dim_1:language")

Mean competence
lang_use %>%
ggplot(aes(mean_comp)) +
geom_density() +
geom_rug()

lang_use %>%
ggplot(aes(language, mean_comp)) +
geom_jitter(width = 0.2, alpha = 0.5, height = 0) +
expand_limits(y = 0)

lang_use %>%
ggplot(aes(language, mean_comp, colour = dim_1)) +
geom_jitter(width = 0.2, alpha = 0.5, height = 0) +
expand_limits(y = 0)

lang_use %>%
ggplot(aes(dim_1, mean_comp)) +
geom_point() +
facet_grid(~ language)

mean_comp_bm <- brm(
mean_comp ~
language *
dim_1,
data = lang_use,
family = zero_one_inflated_beta(),
backend = "cmdstanr",
cores = 4,
file = "./data/rds/mean_comp_bm"
)
mean_comp_bm
## Family: zero_one_inflated_beta
## Links: mu = logit; phi = identity; zoi = identity; coi = identity
## Formula: mean_comp ~ language * dim_1
## Data: lang_use (Number of observations: 578)
## Draws: 4 chains, each with iter = 1000; warmup = 0; thin = 1;
## total post-warmup draws = 4000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept 0.34 0.04 0.25 0.43 1.00 5510
## languageesperanto 0.95 0.13 0.71 1.20 1.00 5105
## dim_1 0.14 0.06 0.03 0.26 1.00 5129
## languageesperanto:dim_1 -0.13 0.16 -0.45 0.18 1.00 5092
## Tail_ESS
## Intercept 2906
## languageesperanto 3080
## dim_1 3163
## languageesperanto:dim_1 2950
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## phi 4.31 0.27 3.81 4.88 1.00 5900 2633
## zoi 0.22 0.02 0.19 0.25 1.00 6071 2944
## coi 0.99 0.01 0.97 1.00 1.00 4188 1861
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
conditional_effects(mean_comp_bm, effects = "dim_1:language")

Dim-1 and age
lang_use %>%
ggplot(aes(age, dim_1)) +
geom_point() +
geom_smooth(method = "lm")

Rural vs urban
Understand
understand_rur <- emil_rur_clean %>%
count(understand, ru_ur) %>%
group_by(ru_ur) %>%
mutate(
ru_ur_prop = n / sum(n)
) %>%
ungroup() %>%
mutate(
understand = factor(understand, ordered = FALSE),
ru_ur = factor(ru_ur, levels = c("rural", "urban"))
)
get_prior(
n ~
understand * ru_ur,
data = understand_rur,
family = Beta()
)
## prior class coef group resp dpar
## (flat) b
## (flat) b ru_ururban
## (flat) b understand50D50
## (flat) b understand50D50:ru_ururban
## (flat) b understandAL
## (flat) b understandAL:ru_ururban
## (flat) b understandG
## (flat) b understandG:ru_ururban
## (flat) b understandVG
## (flat) b understandVG:ru_ururban
## student_t(3, 0, 2.5) Intercept
## gamma(0.01, 0.01) phi
## nlpar bound source
## default
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## (vectorized)
## default
## default
understand_rur_priors <- c(
prior(normal(0, 3), class = Intercept),
prior(normal(0, 1), class = b)
)
understand_rur_bm <- brm(
ru_ur_prop ~
understand * ru_ur,
data = understand_rur,
family = Beta(),
prior = understand_rur_priors,
backend = "cmdstanr",
cores = 4,
file = "./data/rds/understand_rur_bm"
)
understand_rur_bm
## Family: beta
## Links: mu = logit; phi = identity
## Formula: ru_ur_prop ~ understand * ru_ur
## Data: understand_rur (Number of observations: 9)
## Draws: 4 chains, each with iter = 1000; warmup = 0; thin = 1;
## total post-warmup draws = 4000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept -1.82 0.60 -2.93 -0.60 1.00 2295
## understandAL -0.47 0.66 -1.79 0.82 1.00 3290
## understand50D50 0.06 0.66 -1.31 1.29 1.00 2970
## understandG 1.07 0.71 -0.42 2.35 1.00 2182
## understandVG 1.38 0.74 -0.17 2.68 1.00 2080
## ru_ururban -0.53 0.56 -1.55 0.62 1.00 2697
## understandAL:ru_ururban 0.32 0.81 -1.29 1.85 1.00 3007
## understand50D50:ru_ururban 0.10 0.78 -1.50 1.57 1.00 3564
## understandG:ru_ururban 0.48 0.74 -1.08 1.87 1.00 3124
## understandVG:ru_ururban 0.57 0.73 -0.88 1.96 1.00 3422
## Tail_ESS
## Intercept 2570
## understandAL 2503
## understand50D50 2824
## understandG 2140
## understandVG 2591
## ru_ururban 2745
## understandAL:ru_ururban 2585
## understand50D50:ru_ururban 3041
## understandG:ru_ururban 2653
## understandVG:ru_ururban 2795
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## phi 14.59 12.89 2.32 48.69 1.00 1439 1408
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
conditional_effects(understand_rur_bm, "understand:ru_ur")

Speak
speak_rur <- emil_rur_clean %>%
count(speak, ru_ur) %>%
group_by(ru_ur) %>%
mutate(
ru_ur_prop = n / sum(n)
) %>%
ungroup() %>%
mutate(
speak = factor(speak, ordered = FALSE),
ru_ur = factor(ru_ur, levels = c("rural", "urban"))
)
speak_rur_bm <- brm(
ru_ur_prop ~
speak * ru_ur,
data = speak_rur,
family = Beta(),
prior = understand_rur_priors,
backend = "cmdstanr",
cores = 4,
file = "./data/rds/speak_rur_bm"
)
speak_rur_bm
## Family: beta
## Links: mu = logit; phi = identity
## Formula: ru_ur_prop ~ speak * ru_ur
## Data: speak_rur (Number of observations: 10)
## Draws: 4 chains, each with iter = 1000; warmup = 0; thin = 1;
## total post-warmup draws = 4000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept -1.97 0.36 -2.59 -1.18 1.00 1541
## speakAL 0.83 0.47 -0.26 1.61 1.00 1773
## speak50D50 0.11 0.45 -0.87 0.94 1.00 2409
## speakG 1.05 0.47 0.01 1.82 1.00 1456
## speakVG 0.60 0.48 -0.50 1.42 1.00 1639
## ru_ururban -0.22 0.39 -1.00 0.56 1.00 2276
## speakAL:ru_ururban 0.54 0.53 -0.59 1.55 1.00 2737
## speak50D50:ru_ururban 0.52 0.58 -0.73 1.58 1.00 2131
## speakG:ru_ururban -0.02 0.53 -1.08 1.07 1.00 2379
## speakVG:ru_ururban 0.02 0.56 -1.10 1.11 1.00 2700
## Tail_ESS
## Intercept 1655
## speakAL 1909
## speak50D50 2259
## speakG 2106
## speakVG 2279
## ru_ururban 2604
## speakAL:ru_ururban 2566
## speak50D50:ru_ururban 2111
## speakG:ru_ururban 2193
## speakVG:ru_ururban 2523
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## phi 71.40 70.74 7.49 250.05 1.00 619 1074
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
conditional_effects(speak_rur_bm, "speak:ru_ur")

Read and write
rw_rur <- emil_rur_clean %>%
count(read_write, ru_ur) %>%
group_by(ru_ur) %>%
mutate(
ru_ur_prop = n / sum(n)
) %>%
ungroup() %>%
mutate(
ru_ur = factor(ru_ur, levels = c("rural", "urban"))
)
rw_rur_bm <- brm(
ru_ur_prop ~
read_write * ru_ur,
data = rw_rur,
family = Beta(),
prior = understand_rur_priors,
backend = "cmdstanr",
cores = 4,
file = "./data/rds/rw_rur_bm"
)
rw_rur_bm
## Family: beta
## Links: mu = logit; phi = identity
## Formula: ru_ur_prop ~ read_write * ru_ur
## Data: rw_rur (Number of observations: 4)
## Draws: 4 chains, each with iter = 1000; warmup = 0; thin = 1;
## total post-warmup draws = 4000
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS
## Intercept 0.49 0.49 -0.70 1.34 1.00 1987
## read_writeYes -0.99 0.61 -1.96 0.47 1.00 1301
## ru_ururban -0.07 0.54 -1.11 1.08 1.00 2066
## read_writeYes:ru_ururban 0.14 0.65 -1.26 1.29 1.00 1616
## Tail_ESS
## Intercept 1981
## read_writeYes 2029
## ru_ururban 2112
## read_writeYes:ru_ururban 2179
##
## Family Specific Parameters:
## Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
## phi 36.85 49.71 1.94 171.77 1.00 534 902
##
## Draws were sampled using sample(hmc). For each parameter, Bulk_ESS
## and Tail_ESS are effective sample size measures, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
conditional_effects(rw_rur_bm, "read_write:ru_ur")
